library(tidyverse)
# Load the TMDB movies data set from the local archive folder.
movies <- read.csv(file = "archive/tmdb_5000_movies.csv")

# List the column names to get a first overview of the variables.
colnames(movies)
[1] "budget" "genres" "homepage" "id"
[5] "keywords" "original_language" "original_title" "overview"
[9] "popularity" "production_companies" "production_countries" "release_date"
[13] "revenue" "runtime" "spoken_languages" "status"
[17] "tagline" "title" "vote_average" "vote_count"
1
First thoughts: homepage has missing values; revenue contains 0 values; few factor types; popularity is a number with decimals (shouldn’t it be a double?); there are a lot of repetitive values.
2
# Keep only the title, runtime and budget columns.
movies_selected <- select(movies, title, runtime, budget)
3
Ok, now take your subsetted data movies_selected, and count the number of missing values in each column.
4
# Recode a runtime of 0 as NA and store the result in movies_runtime.
movies_runtime <- movies %>%
  mutate(runtime = na_if(runtime, 0))

# Count missing runtimes on the recoded data (not on the original `movies`),
# so the count includes the zeros that were just converted to NA.
movies_runtime %>%
  summarise(n_missing_runtime = sum(is.na(runtime)))
5
# Replace missing runtimes with the median runtime.
# Start from movies_runtime (zeros already recoded to NA) so the zeros are
# imputed too and do not pull the median down.
movies_imputed <- movies_runtime %>%
  mutate(runtime = coalesce(runtime, median(runtime, na.rm = TRUE)))

# Verify the imputation on its own result: this should now report 0.
movies_imputed %>%
  summarise(n_missing_runtime = sum(is.na(runtime)))
6
# Show the rows with the 10 smallest runtimes after imputation.
slice_min(movies_imputed, order_by = runtime, n = 10)
2
Take the original dataset with all the variables. Using across and where, summarise the number of missing values, first across all columns of type character, and then across all columns of type numeric.
# Number of missing values in every character column.
# NOTE: read.csv() only treats "NA" as missing for character fields by
# default, so blank strings are not counted here.
movies %>%
  summarise(across(where(is.character), ~ sum(is.na(.x))))

# Number of missing values in every numeric column.
# across() goes inside summarise(); grouping by the columns instead would
# produce one group per distinct row rather than one count per column.
movies %>%
  summarise(across(where(is.numeric), ~ sum(is.na(.x))))
`summarise()` regrouping output by 'budget', 'id', 'popularity', 'revenue', 'runtime', 'vote_average' (override with `.groups` argument)
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQoKbW92aWVzIDwtIHJlYWQuY3N2KCJhcmNoaXZlL3RtZGJfNTAwMF9tb3ZpZXMuY3N2IikKYGBgCmBgYHtyfQpoZWFkKG1vdmllcykKYGBgCmBgYHtyfQpuYW1lcyhtb3ZpZXMpCmBgYAojIDEKRmlyc3QgdGhvdWdodHMKLSBob21lcGFnZSAtIG1pc3NpbmcgdmFsdWVzCi0gcmV2ZW51ZSAtIDAgdmFsdWUKLSBmZXcgZmFjdG9yIHR5cGVzCi0gcG9wdWxhcml0eSAtIG51bSB3aXRoIGRlY2ltYWwgc2hvdWxkbid0IGJlIGRvdWJsZT8KLSBsb3Qgb2YgcmVwZXRpdGl2ZSB2YWx1ZXMKCiMgMgoKYGBge3J9Cm1vdmllc19zZWxlY3RlZCA8LQptb3ZpZXMgJT4lIAogIHNlbGVjdCh0aXRsZSxydW50aW1lLCBidWRnZXQpCmBgYAojIDMKT2ssIG5vdyB0YWtlIHlvdXIgc3Vic2V0dGVkIGRhdGEgbW92aWVzX3NlbGVjdGVkLCBhbmQgY291bnQgdGhlIG51bWJlciBvZiBtaXNzaW5nIHZhbHVlcyBpbiBlYWNoIGNvbHVtbi4KCmBgYHtyfQojIGNvdW50IHRoZSBudW1iZXIgb2YgbWlzc2luZyB2YWx1ZXMgaW4gZWFjaCBjb2x1bW4KbW92aWVzX3NlbGVjdGVkICU+JSAKICBzdW1tYXJpc2UoY291bnQgPSBzdW0oaXMubmEobW92aWVzX3NlbGVjdGVkKSkpCmBgYAoKIyA0CgpgYGB7cn0KbW92aWVzICU+JSAKICBmaWx0ZXIocnVudGltZSA9PSAwKSAlPiUgCiAgc3VtbWFyaXNlKGNvdW50ID0gc3VtKHJ1bnRpbWUgPT0gMCkpCmBgYAoKCmBgYHtyfQptb3ZpZXNfcnVudGltZTwtCm1vdmllcyAlPiUKICBtdXRhdGUocnVudGltZSA9IG5hX2lmKHJ1bnRpbWUsIDApKQogIApgYGAKYGBge3J9Cm1vdmllcyAlPiUgCiAgc3VtbWFyaXNlKHN1bShpcy5uYShydW50aW1lKSkpCmBgYAoKIyA1CgpgYGB7cn0KbW92aWVzX2ltcHV0ZWQgPC0gbW92aWVzICU+JSAKICBtdXRhdGUocnVudGltZSA9IGNvYWxlc2NlKHJ1bnRpbWUsIG1lZGlhbihydW50aW1lLCBuYS5ybSA9IFRSVUUpKSkKCmBgYApgYGB7cn0KbW92aWVzICU+JSAKICBzdW1tYXJpc2Uoc3VtKGlzLm5hKHJ1bnRpbWUpKSkKYGBgCgojIDYgCgpgYGB7cn0KbW92aWVzX2ltcHV0ZWQgJT4lIAogIHNsaWNlX21heChydW50aW1lLCBuID0gMTApCmBgYApgYGB7cn0KbW92aWVzX2ltcHV0ZWQgJT4lIAogIHNsaWNlX21pbihydW50aW1lLCBuID0gMTApCmBgYAoKIyA3CgpgYGB7cn0KbW92aWVzJT4lIAogIG11dGF0ZShtb3ZpZXNfaW1wdXRlZCA9IGlmX2Vsc2UoYnVkZ2V0IDwgMTAwLCBtZWRpYW4oYnVkZ2V0KSwgYnVkZ2V0KSkKYGBgCgojIEV4dGVuc2lvbiAxCgpgYGB7cn0KbW92aWVfYnVkZ2V0cyA8LSBtb3ZpZXMgJT4lIAogIG11dGF0ZShidWRnZXRfdHlwZSA9IGNhc2Vfd2hlbihidWRnZXQgPCAxMmU2IH4gIlNtYWxsIGJ1ZGdldCIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGJ1ZGdldCA+IDEyZTYgJiBidWRnZXQgPCA0MGU2IH4gIk1lZGl1bSBidWRn
ZXQiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBidWRnZXQgPiA0MGU2IH4gIkJpZyBidWRnZXQiKSkKbW92aWVfYnVkZ2V0cwpgYGAKIyAyClRha2UgdGhlIG9yaWdpbmFsIGRhdGFzZXQgd2l0aCBhbGwgdGhlIHZhcmlhYmxlcy4gVXNpbmcgYWNyb3NzIGFuZCB3aGVyZSwgc3VtbWFyaXNlIHRoZSBudW1iZXIgb2YgbWlzc2luZyB2YWx1ZXMsIGZpcnN0IGFjcm9zcyBhbGwgY29sdW1ucyBvZiB0eXBlIGNoYXJhY3RlciwgYW5kIHRoZW4gYWNyb3NzIGFsbCBjb2x1bW5zIG9mIHR5cGUgbnVtZXJpYy4KYGBge3J9Cm1vdmllcyAlPiUgCiAgZ3JvdXBfYnkoYWNyb3NzKHdoZXJlKGlzLmNoYXJhY3RlcikpKSAlPiUgCiAgc3VtbWFyaXNlKHN1bShpcy5uYSA9IFQpKQpgYGAKYGBge3J9Cm1vdmllcyAlPiUgCiAgZ3JvdXBfYnkoYWNyb3NzKHdoZXJlKGlzLm51bWVyaWMpKSkgJT4lIAogIHN1bW1hcmlzZShzdW0oaXMubmEgPSBUKSkKYGBgCgo=